# coding:utf8

'''
#全国企业信用信息公示系统（湖南）
#维护肖迪
'''
from scpy.logger import get_logger

logger = get_logger(__file__)
import pycurl
import urllib
import re
from utils import kill_captcha
import StringIO
import random
from bs4 import BeautifulSoup
import json
import table
import requests


# def curl(url, data='', cookie='', debug=False):  #抓取函数[get,post]
#     UserAgent = "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"
#     s = StringIO.StringIO()
#     c = pycurl.Curl()
#     c.setopt(c.URL, url)
#     c.setopt(pycurl.CONNECTTIMEOUT, 60)
#     c.setopt(pycurl.TIMEOUT, 120)
#     c.setopt(c.REFERER, 'http://gsxt.scaic.gov.cn/')
#     if cookie:
#         c.setopt(c.COOKIEJAR, "cookie_file_name1")
#     c.setopt(c.COOKIEFILE, "cookie_file_name1")
#     c.setopt(pycurl.FOLLOWLOCATION, True)
#     if data:
#         c.setopt(c.POSTFIELDS, urllib.urlencode(data))
#     c.setopt(pycurl.ENCODING, 'gzip')
#     c.setopt(c.HTTPHEADER, ['Host:gsxt.hnaic.gov.cn', 'Upgrade-Insecure-Requests:1',
#                             'User-Agent:Googlebot/2.1 (+http://www.googlebot.com/bot.html)',
#                             'Origin:http://gsxt.scaic.gov.cn'])
#     c.setopt(c.WRITEDATA, s)
#     c.perform()
#     c.close()
#     return s.getvalue()
def verify(name, **args):
    """Search the Hunan credit-information site for *name* and fetch the first hit.

    The function loads the search page to read its session token, requests a
    captcha image on the same session, posts the search form and, when the
    result page links to a company detail page, downloads that page.

    Args:
        name: Search keyword (company name) sent as ``condition.keyword``.
        **args: Accepted for interface compatibility; not used.

    Returns:
        A ``(detail_html, detail_url)`` tuple for the first matching company,
        or ``''`` when the result page contains no detail link.

    Raises:
        ValueError: After 20 consecutive failed attempts (any exception raised
            while talking to the site counts as a failed attempt).
    """
    search_url = 'http://gsxt.hnaic.gov.cn/notice/search/ent_info_list'
    s = requests.Session()
    loop_num = 0
    while 1:
        loop_num += 1
        try:
            # The search page embeds the token that must accompany the POST.
            page = s.get(search_url, timeout=60).content
            code = re.findall(r'code:"(.+?)"', page.replace(' ', ''))[0]

            # Format the random value with %s: %d would truncate the float in
            # [0, 1) to 0 and make every request use the same "ra" value.
            verify_url = "http://gsxt.hnaic.gov.cn/notice/captcha?preset=&ra=%s" % random.random()
            # The image itself is not used; the request is still issued on the
            # session -- presumably the server expects it before a search
            # (TODO confirm).
            s.get(verify_url, timeout=60)

            # NOTE(review): the captcha answer is hard-coded to 1 instead of
            # being solved -- confirm the site still accepts this.
            data = {"condition.insType": "", "captcha": 1, "session.token": code, "condition.keyword": name,
                    "condition.pageNo": "1"}
            html = s.post(search_url, data=data, timeout=60).content
            detail_url = re.findall(r'<a href="(http://gsxt\.hnaic\.gov\.cn/notice/notice/view.+?)"', html)
            if not detail_url:
                return ''
            detail_html = s.get(detail_url[0], timeout=60).content
            return (detail_html, detail_url[0])
        except Exception as e:
            logger.exception(e)
            if loop_num >= 20:
                logger.info('验证码尝试了20次，退出尝试')
                logger.error('保存word%s' % name)
                raise ValueError('search failed after %s attempts' % loop_num)
            logger.info('验证码错误，正在识别,错误次数%s' % loop_num)


def _fill_shareholder_detail(holder):
    """Expand one shareholder row in place using its linked detail page.

    When the row's ``shareHolderdetail`` cell contains a link, the linked page
    is downloaded and the subscribed amount, contribution date and
    contribution form are extracted from it; otherwise all detail fields are
    set to empty strings.
    """
    cell = holder['shareHolderdetail']
    if 'href' not in cell:
        for field in ('shareHolderdetail', 'subConam', 'conDate',
                      'fundedRatio', 'regCapCur', 'country'):
            holder[field] = ''
        return

    share_url = re.findall(r'href="(.+?)"', cell)[0]
    # Spaces are stripped once so the patterns below match regardless of the
    # page's spacing around "=".
    html = requests.get(share_url, timeout=60).content.replace(' ', '')
    # Extract everything before touching the row, so a missing field raises
    # without leaving the row half-updated.
    subConam = re.findall(r'invt\.subConAm="(.+?)"', html)[0]
    conDate = table.time_clean(re.findall(r"invtActl\.conDate='(.*?)'", html)[0])
    regCapCur = re.findall(r'invt\.conForm="(.+?)"', html)[0]

    holder['shareHolderdetail'] = share_url
    holder['subConam'] = subConam
    holder['conDate'] = conDate
    holder['fundedRatio'] = ''
    holder['regCapCur'] = regCapCur
    holder['country'] = ''


def _parse_annual_report(year, html):
    """Build the report dict for one annual-report page.

    Every section starts from an empty default for each report, so a section
    that is absent from this page is reported as empty rather than being
    undefined or carrying over data parsed from a different year's page.
    """
    report = {"year": year, "baseInfo": {}, "website": {},
              "investorInformations": [], "assetsInfo": {},
              "equityChangeInformations": [], "changeRecords": []}
    for tbl in re.findall(r'<table[\s\S]+?</table>', html):
        # Independent checks: one table fragment may contain several markers.
        if '企业基本信息' in tbl:
            report["baseInfo"] = table.report_basic(tbl)
        if '网站或网店信息' in tbl:
            report["website"] = table.report_website(tbl)
        if '企业资产状况信息' in tbl:
            report["assetsInfo"] = table.report_assetsInfo(tbl)
        if '股东及出资信息' in tbl:
            report["investorInformations"] = table.report_investorInformations(tbl)
        if '股权变更信息' in tbl:
            report["equityChangeInformations"] = table.report_equityChangeInformations(tbl)
        if '修改记录' in tbl:
            report["changeRecords"] = table.report_changeRecords(tbl)
    return report


def run(detail_html, type, **args):
    """Parse a company detail page (plus its annual reports) into a result dict.

    Args:
        detail_html: A ``(html, url)`` pair: the detail page markup and the
            URL it was fetched from.
        type: When equal to ``1`` the raw sources and a company URL record are
            returned alongside the parsed data; any other value returns only
            the parsed data.
        **args: Optional ``searchkey``, stored as ``keyword`` in the raw-source
            record when ``type == 1``.

    Returns:
        The parsed data dict, or for ``type == 1`` the tuple
        ``(result_source, alldata, companyUrl)``.

    Raises:
        IndexError: With ``type == 1`` when no basic-information table was
            parsed (``basicList`` is empty), and when a linked shareholder
            detail page lacks an expected field.
    """
    detail_url = detail_html[1]
    detail_html = detail_html[0]

    # Table title (first <th colspan=...>) -> key of the result section.
    title_to_key = {
        '基本信息': 'basicList',
        '主要人员信息': 'personList',
        '变更信息': 'alterList',
        '分支机构信息': 'filiationList',
        '清算信息': 'liquidationList',
        '经营异常': 'abnormalOperation',
        '经营异常信息': 'abnormalOperation',
    }
    # Every section defaults to an empty list when its table is absent.
    sections = {'basicList': [], 'shareHolderList': [], 'personList': [],
                'alterList': [], 'filiationList': [], 'liquidationList': [],
                'abnormalOperation': []}

    for tbl in re.findall(r'<table[\s\S]+?</table>', detail_html):
        if '股东信息' in tbl:
            sections['shareHolderList'] = table.index('股东信息', tbl)
            for holder in sections['shareHolderList'] or []:
                _fill_shareholder_detail(holder)

        titles = re.findall(r'<th colspan="\d+?">(.+?)</th>', tbl)
        if not titles:
            continue
        key = title_to_key.get(titles[0])
        if key is None:
            continue
        try:
            sections[key] = table.index(titles[0].replace(' ', ''), tbl)
        except Exception as e:
            # Best effort: a table that fails to parse leaves its section as
            # it was, but the failure is recorded.
            logger.exception(e)

    # The annual-report index is the same detail URL with the tab switched.
    index_html = requests.get(detail_url.replace('tab=01', 'tab=02'), timeout=60).content
    report_links = re.findall(
        r'"(http://gsxt\.hnaic\.gov\.cn/notice/notice/view_annual.+?)" target="_blank">(\d+)', index_html)

    yearReportList = []
    yearSource = []
    for report_link, year in report_links:
        html = requests.get(report_link, timeout=60).content
        yearReportList.append(_parse_annual_report(year, html))
        yearSource.append({"year": year, "html": html})

    basicList = sections['basicList']
    alldata = {'province': 'hn', "abnormalOperation": sections['abnormalOperation'], "basicList": basicList,
               "shareHolderList": sections['shareHolderList'], "personList": sections['personList'],
               "punishBreakList": [], "punishedList": [], "alidebtList": [], "entinvItemList": [],
               "frinvList": [], "frPositionList": [], "alterList": sections['alterList'],
               "filiationList": sections['filiationList'], "caseInfoList": [], "sharesFrostList": [],
               "sharesImpawnList": [], "morDetailList": [], "morguaInfoList": [],
               "liquidationList": sections['liquidationList'], "yearReportList": yearReportList}
    if type == 1:
        company_name = basicList[0]['enterpriseName']
        result_source = {"province": "hn", "type": 0, "html": detail_html, "keyword": args.get("searchkey"),
                         "companyName": company_name, "yearList": yearSource}
        companyUrl = {'url': detail_url, 'method': 'get', 'companyName': company_name,
                      "province": "hn"}
        return (result_source, alldata, companyUrl)
    return alldata


def search(key):
    """Look up *key* and return the parsed company data.

    Returns the dict produced by ``run(..., type=0)``, or an empty dict when
    ``verify`` finds nothing.
    """
    found = verify(key)
    if not found:
        return {}
    return run(found, type=0)


def search2(key):
    """Look up *key* and return parsed data together with the raw sources.

    Returns whatever ``run(..., type=1, searchkey=key)`` returns, or an empty
    tuple when ``verify`` finds nothing.
    """
    found = verify(key)
    if not found:
        return ()
    return run(found, type=1, searchkey=key)


def search3(data):
    """Re-crawl a company whose detail URL is already known.

    Args:
        data: Mapping whose ``'url'`` entry is the company detail page URL.

    Returns:
        The result of ``run`` called with ``type=1`` on the downloaded page.
    """
    url = data.get('url')
    # Bounded wait, matching the other page downloads in this module; without
    # a timeout the request could block indefinitely.
    html = requests.get(url, timeout=60).content
    return run((html, url), type=1)


if __name__ == "__main__":
    # Manual smoke test: re-crawl one known company by its detail URL and
    # pretty-print the result.
    #
    # Other ad-hoc checks that have been run from here:
    # print  json.dumps(search2('湖南轻工湘海经贸有限公司'), ensure_ascii=False, )
    # print json.dumps(search3('http://gsxt.hnaic.gov.cn/notice/notice/view?uuid=aJB4n55N8sbaJ3BTvR06t6cgXjrAfm3M&tab=01'),ensure_ascii=False,indent=4)
    # print json.dumps(search2('湖南新环境房地产经纪连锁有限公司'),ensure_ascii=False,indent=4)
    # print verify('湖南华菱钢铁集团有限责任公司')
    # Another sample company name: 福州市闽瑞家具有限公司

    sample_request = {
        "url": "http://gsxt.hnaic.gov.cn/notice/notice/view?uuid=xtZIIcSDrsGfegPBIZF8E0GIlxBO65YX&tab=01",
        "province": "hn",
        "method": "get",
        "companyName": "湖南轻工湘海经贸有限公司",
    }
    crawled = search3(sample_request)
    print(json.dumps(crawled, indent=4, ensure_ascii=False))
